Load libraries
library(ggplot2)
library(tidyr)
library(dplyr)
library(reshape2)
# For ARIMA
library(astsa)
library(forecast)
# For SIR
library(deSolve)
library(dfoptim)
Import data
import_data <- function(file) {
df <- read.csv(file, header = FALSE, skip = 1)
locations <- as.matrix(df[,1])
dates <- as.matrix(read.csv(file, header = FALSE, nrows = 1)[1,-1])
df <- as.data.frame(t(df[,-1]))
rownames(df) <- dates
colnames(df) <- locations
# print(sapply(df, class))
return(df)
}
cases_intl <- import_data("data/International/International_covid_cases_data.csv")
deaths_intl <- import_data("data/International/International_covid_deaths_data.csv")
cases_LA <- import_data("data/LA/LA_cities_covid_data.csv")
cases_US <- import_data("data/US_Counties/US_county_covid_cases_data.csv")
deaths_US <- import_data("data/US_Counties/US_county_covid_deaths_data.csv")
overall_df <- function(df) {
return(data.frame(days=seq(0,length(rownames(df))-1),
cases=rowSums(df[,-1])))
}
overall_plot <- function(df, title) {
ggplot(data=df, aes(x=days, y=cases, group=1)) +
geom_line() +
ggtitle(title)
}
cases_LA_overall <- overall_df(cases_LA)
cases_US_overall <- overall_df(cases_US)
cases_intl_overall <- overall_df(cases_intl)
overall_plot(cases_LA_overall, "Overall LA Cases")
overall_plot(cases_US_overall, "Overall US Cases")
overall_plot(cases_intl_overall, "Overall International Cases")
plot_all <- function(df, title, legend = FALSE) {
rownames(df) <- seq(0,length(rownames(df))-1)
m <- melt(as.matrix(df))
colnames(m) <- c("Days", "Location", "Count")
ggplot(m, aes(x=Days, y=Count, color=Location)) +
geom_line(show.legend = legend) +
scale_colour_viridis_d() +
ggtitle(title)
}
max_counts <- function(df) {
rownames(df) <- seq(0,length(rownames(df))-1)
m <- melt(as.matrix(df))
colnames(m) <- c("Days", "Location", "Count")
return(m %>% group_by(Location) %>% slice_max(n=1, order_by=Count, with_ties = FALSE) %>% ungroup())
}
plot_all(cases_intl, "International Cases")
cases_intl_max <- max_counts(cases_intl)
cases_intl_max <- cases_intl_max[order(cases_intl_max$Count, decreasing = TRUE),]
cases_intl_max
## # A tibble: 188 x 3
## Days Location Count
## <int> <fct> <int>
## 1 176 US 3576157
## 2 176 Brazil 2012151
## 3 176 India 1003832
## 4 176 Russia 751612
## 5 176 Peru 341586
## 6 176 South Africa 324221
## 7 176 Mexico 324041
## 8 176 Chile 323698
## 9 176 United Kingdom 294116
## 10 176 Iran 267061
## # … with 178 more rows
plot_all(cases_US, "US County Cases")
cases_US_max <- max_counts(cases_US)
cases_US_max <- cases_US_max[order(cases_US_max$Count, decreasing = TRUE),]
cases_US_max
## # A tibble: 3,209 x 3
## Days Location Count
## <int> <fct> <int>
## 1 174 New York City, New York 223977
## 2 174 Los Angeles, California 136129
## 3 174 Cook, Illinois 95884
## 4 174 Maricopa, Arizona 81216
## 5 174 Miami-Dade, Florida 67712
## 6 174 Harris, Texas 47369
## 7 174 Nassau, New York 42354
## 8 174 Suffolk, New York 42112
## 9 174 Westchester, New York 35326
## 10 174 Dallas, Texas 34914
## # … with 3,199 more rows
plot_all(cases_LA, "LA Cases")
cases_LA_max <- max_counts(cases_LA)
cases_LA_max <- cases_LA_max[order(cases_LA_max$Count, decreasing = TRUE),]
cases_LA_max
## # A tibble: 339 x 3
## Days Location Count
## <int> <fct> <int>
## 1 104 Unincorporated - East Los Angeles 3327
## 2 104 City of South Gate 2432
## 3 104 Los Angeles - Boyle Heights* 2375
## 4 104 City of Downey 2349
## 5 104 City of Pomona 2292
## 6 104 City of El Monte 2244
## 7 104 City of Compton 2049
## 8 104 Unincorporated - Castaic* 1822
## 9 104 City of Lynwood* 1812
## 10 104 Unincorporated - Florence-Firestone 1804
## # … with 329 more rows
plot_all(select(cases_intl, cases_intl_max$Location[1:10]), "International Cases Top 10", legend=TRUE)
plot_all(select(cases_US, cases_US_max$Location[1:10]), "US County Cases Top 10", legend=TRUE)
# locations with nice curves
# plot_all(select(cases_US, `New York City, New York`, `Suffolk, New York`, `Cook, Illinois`), "US County Cases Selected Plots", legend = TRUE)
# Los Angeles
# plot_all(select(cases_US, `Los Angeles, California`), "Los Angeles, California")
plot_all(select(cases_LA, cases_LA_max$Location[1:10]), "LA Cases Top 10", legend=TRUE)
arima_wrap <- function(df, location, percentage = 0.7, verbose = TRUE) {
timeseries = unlist(as.list(select(df, location)))
if(verbose) {
plot.ts(timeseries)
plot.ts(diff(timeseries))
acf(diff(timeseries))
}
train_series <- timeseries[1:length(timeseries)*percentage]
test_series <- timeseries[-1:-length(timeseries)*percentage]
# parametrize model
AutoArimaModel=auto.arima(train_series)
AutoArimaModel
# train
futurVal <- forecast(AutoArimaModel,h=10, level=c(99.5))
futurVal <- forecast(AutoArimaModel,h=50, level=c(80)) #adjust timesteps ahead, and confidence level
checkresiduals(AutoArimaModel)
plot(futurVal)
return(futurVal)
}
futurSuffolk <- arima_wrap(cases_US, "Suffolk, New York")
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(location)` instead of `location` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
##
## Ljung-Box test
##
## data: Residuals from ARIMA(3,2,0)
## Q* = 95.378, df = 7, p-value < 2.2e-16
##
## Model df: 3. Total lags used: 10
futurLA <- arima_wrap(cases_LA_overall, "cases")
##
## Ljung-Box test
##
## data: Residuals from ARIMA(3,2,1)
## Q* = 22.686, df = 6, p-value = 0.0009089
##
## Model df: 4. Total lags used: 10
futurUS <- arima_wrap(cases_US_overall, "cases")
##
## Ljung-Box test
##
## data: Residuals from ARIMA(5,2,0)
## Q* = 120.98, df = 5, p-value < 2.2e-16
##
## Model df: 5. Total lags used: 10
futurIntl <- arima_wrap(cases_intl_overall, "cases")
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,2,2)
## Q* = 70.454, df = 6, p-value = 3.3e-13
##
## Model df: 4. Total lags used: 10
forecasts = c(2,7,30)
futur <- list(futurSuffolk$mean[forecasts], futurUS$mean[forecasts], futurLA$mean[forecasts], futurIntl$mean[forecasts])
futur <- data.frame(matrix(unlist(futur), nrow=length(futur), byrow=T),stringsAsFactors=FALSE)
rownames(futur) <- c("Suffolk, New York", "All LA Cities", "All US Counties", "All International")
colnames(futur) <- c("2 Days", "1 Week", "1 Month")
futur
## 2 Days 1 Week 1 Month
## Suffolk, New York 38666.99 39054.25 40752.36
## All LA Cities 1611784.46 1703922.52 2084156.34
## All US Counties 66611.48 71241.45 91621.37
## All International 5511041.30 5826711.49 7442012.61
Infected <- cases_US$`Los Angeles, California`[-1:-25] # Suffolk, New York
Days <- seq(1,length(Infected))
#plot(Days, Infected)
SIR <- function(time, state, parameters) {
par <- as.list(c(state, parameters))
with(par, {
dS <- -beta * I * S / N
dI <- beta * I * S / N - gamma * I
dR <- gamma * I
list(c(dS, dI, dR))
})
}
N <- 3.99e6 #1.477e6
init <- c(
S = N - Infected[1],
I = Infected[1],
R = 0
)
# define a function to calculate the residual sum of squares
# (RSS), passing in parameters beta and gamma that are to be
# optimised for the best fit to the incidence data
RSS <- function(parameters) {
names(parameters) <- c("beta", "gamma")
out <- ode(y = init, times = Days, func = SIR, parms = parameters)
fit <- out[, 3]
sum((Infected - fit)^2)
}
# now find the values of beta and gamma that give the
# smallest RSS, which represents the best fit to the data.
# Start with values of 0.5 for each, and constrain them to
# the interval 0 to 1.0
lower = c(0, 0)
upper = c(1, 1)
Opt <- optim(c(0.5, 0.5),
RSS,
method = "L-BFGS-B",
lower = lower,
upper = upper
)
Opt$message
## [1] "CONVERGENCE: REL_REDUCTION_OF_F <= FACTR*EPSMCH"
# Opt <-nmkb(c(0.5, 0.5), RSS, lower=lower, upper=upper)
# Opt$message
# library(Rvmmin)
# Opt <- Rvmmin(c(0.5, 0.5), RSS, lower=lower, upper=upper)
# Opt$message
Opt_par <- setNames(Opt$par, c("beta", "gamma"))
Opt_par
## beta gamma
## 0.4079545 0.3115256
Opt_par[1] = 0.38
Opt_par[2] = 0.29
# get the fitted values from our SIR model
fitted_cumulative_incidence <- data.frame(ode(
y = init, times = Days,
func = SIR, parms = Opt_par
))
fitted_cumulative_incidence <- fitted_cumulative_incidence %>%
mutate(
cumulative_incident_cases = Infected
)
fitted_cumulative_incidence %>%
ggplot(aes(x = time)) +
geom_line(aes(y = I), colour = "red") +
geom_point(aes(y = cumulative_incident_cases), colour = "blue") +
labs(
y = "Cumulative incidence",
title = "COVID-19 fitted vs observed cumulative incidence, Los Angeles, California",
subtitle = "(Red = fitted from SIR model, blue = observed)"
) +
theme_minimal()
R0 <- as.numeric(Opt_par[1] / Opt_par[2])
R0
## [1] 1.310345